# Code (notebook export artifact)
# Load required libraries
# NOTE(review): this block was pasted twice in the original file (every
# library() call appeared two times); each package is attached once here.
library(tidyverse)    # dplyr, purrr, readr, ggplot2, ...
library(maps)         # map_data("world") basemap polygons
library(ggplot2)      # also attached by tidyverse; kept from original
library(gridExtra)
library(sf)
library(viridis)      # scale_fill_viridis()
library(scales)       # scales::comma label formatter
library(dplyr)        # also attached by tidyverse; kept from original
library(randomForest) # randomForest(), predict(), importance()
library(caret)
library(pdp)
# Set the path to the 2017-2020 folder
#path <- "Data/AIS Fishing Effort 2017-2020"
# List all CSV files in the folder
#AIS_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE, recursive = TRUE)
# Read all CSV files and combine them into a single data frame
#AIS_fishing <- AIS_csv_files %>%
# map_df(~read_csv(.))
load(here::here("Data", "AIS_fishing.Rdata"))
# Sum AIS fishing hours within each source grid cell (keyed by the cell's
# lower-left lat/lon), keeping only cells with positive recorded effort.
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE),
            .groups = "drop") %>%
  filter(total_fishing_hours > 0)
# Snap coordinates down to the lower-left corner of their grid cell.
#
# Args:
#   lon, lat: numeric vectors (fully vectorized).
#   res:      grid resolution in degrees. Defaults to 0.1, which
#             reproduces the original hard-coded behaviour exactly.
#
# Returns a list with elements lon_std and lat_std: the inputs floored to
# the nearest multiple of `res` (floor, not round, so every cell is
# indexed by its lower-left corner).
standardize_coords <- function(lon, lat, res = 0.1) {
  scale <- 1 / res
  list(
    lon_std = floor(lon * scale) / scale,
    lat_std = floor(lat * scale) / scale
  )
}
# Standardize and aggregate AIS data onto the 0.1-degree grid.
# NOTE(review): standardize_coords() is vectorized (floor over whole
# vectors), so it is applied directly to the columns; the original built
# one list per row with map2() and unpacked it with map_dbl(), which is
# far slower for identical values.
AIS_data_std <- aggregated_AIS_fishing %>%
  mutate(
    lon_std = standardize_coords(cell_ll_lon, cell_ll_lat)$lon_std,
    lat_std = standardize_coords(cell_ll_lon, cell_ll_lat)$lat_std
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")
# Create the world map
world_map <- map_data("world")
# Map of total AIS fishing effort per 0.1-degree cell.
# NOTE(review): the basemap aesthetics were `aes(long = long, lat = lat,
# ...)`; ggplot2 has no `long`/`lat` aesthetics, so they were ignored
# with a warning -- they are mapped to x/y here. The plot is also
# assigned and printed explicitly so it still renders when the file is
# run via source().
AIS_effort_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = AIS_data_std,
            aes(x = lon_std, y = lat_std, fill = total_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",  # compress the heavy right tail of fishing hours
    name = "AIS Fishing Effort (hours; 2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
print(AIS_effort_plot)
# Set the path to the SAR Vessel detections 2017-2020 folder
#path <- "Data/SAR Vessel detections 2017-2020"
# List all CSV files in the folder
#SAR_csv_files <- list.files(path = here::here(path), pattern = "*.csv", full.names = TRUE)
# Read all CSV files and combine them into a single data frame
#SAR_fishing <- SAR_csv_files %>%
# map_df(~read_csv(.))
load(here::here("Data","SAR_fishing.Rdata"))
# Aggregate SAR vessel detections by rounded latitude/longitude.
# NOTE(review): the fishing_score filter now runs before rounding and
# grouping; in the original it sat between group_by() and summarise(),
# which kept the same rows but forced grouped evaluation for no benefit.
aggregated_SAR_fishing <- SAR_fishing %>%
  # Keep only detections confidently classified as fishing
  filter(fishing_score >= 0.9) %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    # presumably a detection-count-like total -- confirm against the
    # SAR data documentation
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0))
# Standardize and aggregate SAR data onto the same 0.1-degree grid.
# NOTE(review): standardize_coords() is vectorized, so it is applied to
# the columns directly; the original per-row map2()/map_dbl() round-trip
# produced identical values far more slowly.
SAR_data_std <- aggregated_SAR_fishing %>%
  mutate(
    lon_std = standardize_coords(lon_rounded, lat_rounded)$lon_std,
    lat_std = standardize_coords(lon_rounded, lat_rounded)$lat_std
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")
# Create the world map
world_map <- map_data("world")
# Map of summed SAR vessel detections per 0.1-degree cell.
# NOTE(review): `aes(long = long, lat = lat, ...)` are not ggplot2
# aesthetics and were ignored with a warning; they are mapped to x/y
# here. The plot is assigned and printed so it renders under source().
SAR_detections_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = SAR_data_std,
            aes(x = lon_std, y = lat_std, fill = total_presence_score)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",  # compress the heavy right tail of detection counts
    name = "Fishing vessel detections (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = NULL,
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
print(SAR_detections_plot)
# Merge the datasets
# Join the AIS and SAR grids on the shared 0.1-degree cell coordinates;
# full_join keeps cells present in either dataset (NAs where absent).
combined_data <- full_join(
  AIS_data_std,
  SAR_data_std,
  by = c("lon_std", "lat_std")
)
# Categorize each cell by which data source(s) detected fishing there.
# NOTE(review): rewritten with explicit has_AIS / has_SAR flags so the
# logic matches the identical classification done later for
# combined_data_01deg. `!flag` covers both NA (cell absent from that
# dataset) and zero totals, as the original conditions did; totals are
# non-negative here (AIS cells were filtered to > 0 upstream).
combined_data <- combined_data %>%
  mutate(
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0,
    category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS & !has_SAR ~ "Only AIS",
      !has_AIS & has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )
# Create the world map
world_map <- map_data("world")
# Categorical map of AIS/SAR coverage per cell.
# NOTE(review): `aes(long = long, lat = lat, ...)` are not ggplot2
# aesthetics and were ignored with a warning; mapped to x/y here.
world_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data,
            aes(x = lon_std, y = lat_std, fill = category)) +
  scale_fill_manual(
    values = c("Both AIS and SAR" = "purple",
               "Only AIS" = "blue",
               "Only SAR" = "red",
               "No fishing detected" = "white"),
    name = "Fishing Data Source",
    guide = guide_legend(title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Fishing Detection",
       subtitle = "Comparison of AIS (2017-2020) and SAR (2017-2020) data at 0.1-degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
# Display the plot
print(world_plot)
# Get summary statistics
# Tally cells per detection category and convert counts to shares
summary_stats <- combined_data %>%
  count(category, name = "n") %>%
  mutate(percentage = 100 * n / sum(n))
print(summary_stats)
# A tibble: 3 × 3
# ---- Pasted console output (commented out so the script parses) ----
#   category             n       percentage
#   <chr>                <int>   <dbl>
# 1 Both AIS and SAR     163095  9.12
# 2 Only AIS             1566190 87.6
# 3 Only SAR             58668   3.28
#---- Open the AIS data
#load(here::here("Data", "AIS_fishing.Rdata"))
# Aggregate fishing hours by latitude and longitude
# Sum AIS fishing hours within each source grid cell and keep only the
# cells with positive effort (same aggregation as earlier in the file)
aggregated_AIS_fishing <- AIS_fishing %>%
  group_by(cell_ll_lat, cell_ll_lon) %>%
  summarise(total_fishing_hours = sum(fishing_hours, na.rm = TRUE),
            .groups = "drop") %>%
  filter(total_fishing_hours > 0)
#---- Open the SAR data
#load(here::here("Data", "SAR_fishing.Rdata"))
# Aggregate fishing hours by latitude and longitude
# Aggregate SAR detections by rounded latitude/longitude.
# NOTE(review): the fishing_score filter now runs before group_by();
# filtering inside groups returned the same rows but forced grouped
# evaluation for no benefit.
aggregated_SAR_fishing <- SAR_fishing %>%
  # filter(matched_category == "fishing") %>%
  filter(fishing_score >= 0.9) %>%
  mutate(
    lat_rounded = round(lat, digits = 2),
    lon_rounded = round(lon, digits = 2)
  ) %>%
  group_by(lat_rounded, lon_rounded) %>%
  summarise(
    total_presence_score = sum(presence_score, na.rm = TRUE),
    avg_fishing_score = mean(fishing_score, na.rm = TRUE),
    count = n(),
    .groups = "drop"
  ) %>%
  mutate(total_presence_score = round(total_presence_score, digits = 0))
# Snap coordinates down to the lower-left corner of their grid cell.
# (Redefinition of the earlier helper; kept because the original file
# defines it twice.)
#
# Args:
#   lon, lat: numeric vectors (fully vectorized).
#   res:      grid resolution in degrees; default 0.1 reproduces the
#             original hard-coded behaviour exactly.
#
# Returns a list with lon_std and lat_std floored to `res` multiples.
standardize_coords <- function(lon, lat, res = 0.1) {
  scale <- 1 / res
  list(
    lon_std = floor(lon * scale) / scale,
    lat_std = floor(lat * scale) / scale
  )
}
# Aggregate AIS data to 0.1 degree resolution.
# NOTE(review): the original built a per-row list with purrr::map2() and
# unpacked it with sapply() (type-unstable and slow); standardize_coords()
# is vectorized, so it is applied to the columns directly.
AIS_data_01deg <- aggregated_AIS_fishing %>%
  mutate(
    lon_std = standardize_coords(cell_ll_lon, cell_ll_lat)$lon_std,
    lat_std = standardize_coords(cell_ll_lon, cell_ll_lat)$lat_std
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_fishing_hours = sum(total_fishing_hours, na.rm = TRUE), .groups = "drop")
# Aggregate SAR data to 0.1 degree resolution.
# NOTE(review): same fix as for the AIS grid -- the vectorized
# standardize_coords() replaces the per-row map2()/sapply() round-trip.
SAR_data_01deg <- aggregated_SAR_fishing %>%
  mutate(
    lon_std = standardize_coords(lon_rounded, lat_rounded)$lon_std,
    lat_std = standardize_coords(lon_rounded, lat_rounded)$lat_std
  ) %>%
  group_by(lon_std, lat_std) %>%
  summarise(total_presence_score = sum(total_presence_score, na.rm = TRUE), .groups = "drop")
# Join the two 0.1-degree grids, keeping cells present in either source,
# and flag which source(s) actually recorded fishing in each cell.
combined_data_01deg <- full_join(
  AIS_data_01deg,
  SAR_data_01deg,
  by = c("lon_std", "lat_std")
) %>%
  mutate(
    # NA means the cell is absent from that dataset; treat as "no data"
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0
  )
# Split the cells: the model is fitted where both sources agree a cell is
# fished; cells seen only by SAR are scored with the fitted model later.
training_data <- combined_data_01deg %>%
  filter(has_AIS, has_SAR) %>%
  select(total_fishing_hours, total_presence_score, lon_std, lat_std)
prediction_data <- combined_data_01deg %>%
  filter(!has_AIS, has_SAR) %>%
  select(total_presence_score, lon_std, lat_std)
# Train the random forest model with timing
# (training is commented out because the fitted model is loaded from disk
# below; uncomment to refit from training_data)
#set.seed(123) # for reproducibility
#rf_timing <- system.time({
# rf_model_01deg <- randomForest(
# total_fishing_hours ~ total_presence_score + lon_std + lat_std,
# data = training_data,
# ntree = 500,
# importance = TRUE
# )
#})
# Print the time taken
#print(paste("Random Forest model training time:", rf_timing["elapsed"], "seconds"))
#save(rf_model_01deg,file="rf_model_01deg.Rdata")
# Restores the fitted model object named `rf_model_01deg` (the name used
# in the save() call above) into the global environment.
load(here::here("rf_model_01deg.Rdata"))
#Visualise results
# Score SAR-only cells with the fitted forest.
# NOTE(review): two fixes relative to the original --
#   1. the loaded object is `rf_model_01deg` (see the save() call above);
#      the original referenced `rf_model`, which is never defined.
#   2. predictions are computed only for !has_AIS & has_SAR rows. The
#      original called predict() inside case_when() on the whole table,
#      and predict.randomForest() stops on NA predictors, which AIS-only
#      rows have in total_presence_score.
predictions <- predict(rf_model_01deg, newdata = prediction_data)
sar_only_pred <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  select(lon_std, lat_std, total_presence_score)
sar_only_pred$rf_prediction <- predict(rf_model_01deg, newdata = sar_only_pred)
# Splice observed AIS hours and model estimates into a single column
combined_data_01deg <- combined_data_01deg %>%
  left_join(select(sar_only_pred, lon_std, lat_std, rf_prediction),
            by = c("lon_std", "lat_std")) %>%
  mutate(
    predicted_fishing_hours = case_when(
      has_AIS ~ total_fishing_hours,  # keep observed effort where AIS exists
      has_SAR ~ rf_prediction,        # model estimate for SAR-only cells
      TRUE ~ 0                        # no detections from either source
    )
  ) %>%
  select(-rf_prediction)
# Create the world map
world_map <- map_data("world")
#Map of predicted fishing hours only
# Cells that have SAR detections but no AIS effort, with their estimates.
# NOTE(review): the original stored this data frame in a variable named
# `map_data`, shadowing ggplot2::map_data(); renamed to avoid confusion.
sar_only_cells <- combined_data_01deg %>%
  filter(!has_AIS & has_SAR) %>%
  select(lon_std, lat_std, predicted_fishing_hours)
# Create the map (basemap coordinates mapped explicitly to x/y)
predicted_SAR_only_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "darkgray", fill = "lightgray", size = 0.1) +
  geom_tile(data = sar_only_cells,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",
    name = "Predicted fishing hours (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Predicted Fishing Hours in Areas with Only SAR Detections",
       subtitle = "0.1 degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
# Print the map
print(predicted_SAR_only_plot)
#Map of both original and predicted AIS fishing hours
# Map combining observed AIS effort with model estimates for SAR-only
# cells (the predicted_fishing_hours column built above).
predicted_plot <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = combined_data_01deg,
            aes(x = lon_std, y = lat_std, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",  # compress the long right tail of fishing hours
    name = "Predicted fishing hours (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Predicted Fishing Hours (0.1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
print(predicted_plot)
#Aggregate data to 1 degree resolution
# First, round the coordinates to the nearest degree
# Round each cell's lower-left coordinate to the nearest whole degree
combined_data_1deg <- combined_data_01deg %>%
  mutate(
    lon_1deg = round(lon_std),
    lat_1deg = round(lat_std)
  )
# Sum the predicted hours within every 1-degree cell
aggregated_data <- combined_data_1deg %>%
  group_by(lon_1deg, lat_1deg) %>%
  summarise(predicted_fishing_hours = sum(predicted_fishing_hours, na.rm = TRUE),
            .groups = "drop")
# Create the world map
world_map <- map_data("world")
# Same combined-prediction map as above, coarsened to 1-degree cells
predicted_plot_1deg <- ggplot() +
  geom_map(data = world_map, map = world_map,
           aes(x = long, y = lat, map_id = region),
           color = "black", fill = "lightgray", size = 0.1) +
  geom_tile(data = aggregated_data,
            aes(x = lon_1deg, y = lat_1deg, fill = predicted_fishing_hours)) +
  scale_fill_viridis(
    option = "inferno",
    direction = -1,
    trans = "log1p",  # compress the long right tail of fishing hours
    name = "Predicted fishing hours (2017-2020)",
    breaks = c(0, 1, 10, 100, 1000, 10000, 100000, 1000000),
    labels = scales::comma,
    guide = guide_colorbar(barwidth = 20, barheight = 0.5, title.position = "top", title.hjust = 0.5)
  ) +
  coord_fixed(1.3) +
  theme_minimal() +
  labs(title = "Global Predicted Fishing Hours (1 degree resolution)",
       subtitle = "Based on AIS data and Random Forest predictions from SAR data made at 0.1 degree resolution",
       x = "Longitude", y = "Latitude") +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.box = "vertical",
    legend.margin = ggplot2::margin(t = 20, r = 0, b = 0, l = 0),
    legend.title = element_text(margin = ggplot2::margin(b = 10))
  )
# Print the map
print(predicted_plot_1deg)
# Evaluate the model
# Function to evaluate models
# Compute regression diagnostics for a fitted randomForest model.
#
# Args:
#   model:      fitted randomForest regression object.
#   data:       data frame with the predictor columns plus the observed
#               response in `total_fishing_hours`.
#   log_target: set TRUE if the model was trained on log10(y + 1);
#               predictions are then back-transformed with 10^p - 1.
#
# Returns a named list: MAE, RMSE, MAPE, MedAE, R_squared,
# Adjusted_R_squared, Mean_Residual, SD_Residual, Feature_Importance.
evaluate_model <- function(model, data, log_target = FALSE) {
  preds <- predict(model, newdata = data)
  if (log_target) {
    preds <- 10^preds - 1  # invert the log10(y + 1) training transform
  }
  actual <- data$total_fishing_hours
  # Basic error metrics. MAPE divides by `actual`, so near-zero observed
  # values inflate the percentage -- interpret it with care.
  mae <- mean(abs(actual - preds), na.rm = TRUE)
  rmse <- sqrt(mean((actual - preds)^2, na.rm = TRUE))
  mape <- mean(abs((actual - preds) / actual) * 100, na.rm = TRUE)
  medae <- median(abs(actual - preds), na.rm = TRUE)
  # R-squared and adjusted R-squared
  ss_total <- sum((actual - mean(actual, na.rm = TRUE))^2, na.rm = TRUE)
  ss_residual <- sum((actual - preds)^2, na.rm = TRUE)
  r_squared <- 1 - (ss_residual / ss_total)
  n <- length(actual)
  # Variable importance doubles as the predictor count.
  # NOTE(review): the original read model$forest$independent.variable.names,
  # which is a ranger slot, not a randomForest one; it was NULL, so p was 0
  # and the adjusted R-squared always equalled the raw R-squared (as the
  # pasted output in this file shows).
  feature_importance <- importance(model)
  p <- nrow(feature_importance)
  adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
  # Residual analysis (local name avoids shadowing base::residuals)
  resids <- actual - preds
  mean_residual <- mean(resids, na.rm = TRUE)
  sd_residual <- sd(resids, na.rm = TRUE)
  list(
    MAE = mae,                              # Mean Absolute Error
    RMSE = rmse,                            # Root Mean Squared Error
    MAPE = mape,                            # Mean Absolute Percentage Error
    MedAE = medae,                          # Median Absolute Error
    R_squared = r_squared,                  # Coefficient of determination
    Adjusted_R_squared = adj_r_squared,     # Penalized for predictor count
    Mean_Residual = mean_residual,          # Bias of the residuals
    SD_Residual = sd_residual,              # Spread of the residuals
    Feature_Importance = feature_importance # randomForest importance matrix
  )
}
# Rebuild the merged 0.1-degree table, this time with a labelled
# data-availability category per cell.
combined_data_01deg <- full_join(
  AIS_data_01deg,
  SAR_data_01deg,
  by = c("lon_std", "lat_std")
) %>%
  mutate(
    has_AIS = !is.na(total_fishing_hours) & total_fishing_hours > 0,
    has_SAR = !is.na(total_presence_score) & total_presence_score > 0
  ) %>%
  mutate(
    # case_when is first-match: later branches only see rows where the
    # earlier conditions were FALSE, so the flags need no negation.
    data_category = case_when(
      has_AIS & has_SAR ~ "Both AIS and SAR",
      has_AIS ~ "Only AIS",
      has_SAR ~ "Only SAR",
      TRUE ~ "No fishing detected"
    )
  )
# Evaluate the model on cells that contain both AIS and SAR data.
# NOTE(review): the original called evaluate_model(rf_model, ...), but the
# object restored by load() is named rf_model_01deg (see the save() call
# above). Also note these are the same cells the forest was trained on,
# so the metrics below are in-sample and therefore optimistic.
validation_data <- combined_data_01deg %>% filter(data_category == "Both AIS and SAR")
results_no_transform <- evaluate_model(rf_model_01deg, validation_data)
print(results_no_transform)
# ---- Pasted console output from print(results_no_transform),
# ---- commented out so the script parses ----
# $MAE
# [1] 275.048
# $RMSE
# [1] 1177.043
# $MAPE
# [1] 1965.5
# $MedAE
# [1] 55.19061
# $R_squared
# [1] 0.9165729
# $Adjusted_R_squared
# [1] 0.9165729
# $Mean_Residual
# [1] -0.1617396
# $SD_Residual
# [1] 1177.047
# $Feature_Importance
#                       %IncMSE IncNodePurity
# total_presence_score 98.83689  987162725907
# lon_std              43.65497  791743926811
# lat_std              38.67563  696033312224
# Model evaluation interpretation:
# Strengths: the model explains a large portion of the variance in the data
# (high R² and adjusted R²), and the median error (MedAE) is reasonably low,
# suggesting that the model is accurate for many of the predictions.
# Weaknesses: the high RMSE, MAPE, and standard deviation of the residuals
# indicate some large errors and possible outliers that skew the results.
# The very high MAPE is particularly concerning: it suggests the model may
# struggle on certain cells, likely those with low observed fishing hours,
# where even small absolute errors become large percentage errors.
# Feature importance: the model relies heavily on total_presence_score,
# with the location features (lon_std, lat_std) also playing significant
# roles.
# To improve the model, investigate the outliers and large errors, and
# consider refining the model or using alternative approaches that handle
# these cases better.